require(magrittr)
require(knitr)
require(dplyr)
require(tidyr)
require(tibble)
require(ggplot2)
require(Seurat)
require(cowplot)
require(kableExtra)
require(readr)
require(readxl)
source("seurat_helper_functions.R")
parent.directory <- "intermediate/"
parent.directory.mtx <- "filtered/"

A. GC and ASC extraction

A.1. Getting cell identifiers from large Seurat object and re-running analysis

We take our GC and antibody-secreting cell (ASC) identifiers from our orignal Seurat dataset. We then get the information from the original 10X files, only taking the cell identifiers from the GC and ASCs.

load(file = paste0(parent.directory, "sct_cluster_metadata.RData"))
GCPC_metadata %>%
  filter(parent_cluster != "B PLCG2") -> GCPC_metadata
list_identifiers <- c("TC124.1", "TC124.2", "TC124.3", "TC125", "TC126")
cell_barcodes_list <- lapply(1:length(list_identifiers), function(i){
  GCPC_metadata %>%
    filter(orig_ident == list_identifiers[i]) %>%
    select(identifier) %>%
    mutate(raw_barcode = sub(paste0(list_identifiers[i], "_"), "", identifier)) %>%
    pull(raw_barcode) -> raw_barcodes
  return(raw_barcodes)
})
TC124.1.data <- Read10X(data.dir = paste0(parent.directory.mtx, "Amit1"), strip.suffix = TRUE)
TC124.2.data <- Read10X(data.dir = paste0(parent.directory.mtx, "Amit7"), strip.suffix = TRUE)
TC124.3.data <- Read10X(data.dir = paste0(parent.directory.mtx, "Amit8"), strip.suffix = TRUE)
TC125.data <- Read10X(data.dir = paste0(parent.directory.mtx, "Amit2"), strip.suffix = TRUE)
TC126.data <- Read10X(data.dir = paste0(parent.directory.mtx, "Amit3"), strip.suffix = TRUE)

B.names <- c("TC124.1", "TC124.2", "TC124.3", "TC125", "TC126")
B.list <- list(TC124.1.data, TC124.2.data, TC124.3.data, TC125.data, TC126.data)
rm(list = ls()[grepl(".data", ls(), fixed = TRUE)])

B.list <- lapply(1:length(B.list), function(i){
  B.list[[i]] <- B.list[[i]][,cell_barcodes_list[[i]]]
  colnames(B.list[[i]]) <- paste0(B.names[i], "_", colnames(B.list[[i]]))
  return(CreateSeuratObject(B.list[[i]], project = B.names[[i]], min.cells = 3, min.features = 200))
})

A.2. Re-run SCTransform normalization and integration

Now that we have our new Seurat objects, we will add metadata and then will re-perform our SCTransform normalization and the Seurat v3 integration algorithm.

read_excel("dissociation_genes.xlsx") %>% select('Human ID') %>% na.omit %>% pull("Human ID") -> dissociation.genes

for(i in 1:length(B.list)){
  temp.dissoc.genes <- dissociation.genes[dissociation.genes %in% rownames(B.list[[i]])]
  B.list[[i]][["percent.dissoc"]] <- PercentageFeatureSet(B.list[[i]], features = temp.dissoc.genes)
  B.list[[i]][["percent.mt"]] <- PercentageFeatureSet(B.list[[i]], pattern = "^MT-")
  B.list[[i]][["donor"]] <- substring(B.list[[i]]$orig.ident, 1, 5)
}
#This removes the unused data thus far
rm(i, temp.dissoc.genes)

B.list <- list(merge(x = B.list[[1]], y = B.list[2:3]), B.list[[4]], B.list[[5]])
B.list <- lapply(1:length(B.list), function(i){
  return(SCTransform(B.list[[i]], verbose = TRUE))
})
B.features <- SelectIntegrationFeatures(object.list = B.list, nfeatures = 3000)
B.list <- PrepSCTIntegration(object.list = B.list, anchor.features = B.features, verbose = TRUE)
B.anchors <- FindIntegrationAnchors(object.list = B.list, normalization.method = "SCT", anchor.features = B.features, dims = 1:40, verbose = TRUE)
B.gcpc <- IntegrateData(anchorset = B.anchors, normalization.method = "SCT", dims = 1:40, verbose = TRUE) 
save(B.gcpc, GCPC_metadata, file = paste0(parent.directory, "sct_gcpc_integrated.RData"))

A.3. Re-run PCA, UMAP, and Louvain clustering on smaller dataset.

We perform PCA and UMAP and Louvain clustering just as before. We will also perform Louvain clustering at a single resolution, followed by scoring of specific gene sets just as in the original Seurat dataset.

load(file = paste0(parent.directory, "sct_gcpc_integrated.RData"))
B.gcpc[["SCT"]] <- NULL
IG.genes <- grep("^IGL|^IGK|^IGHV", B.gcpc@assays$integrated@var.features, value = TRUE)
B.gcpc@assays$integrated@var.features <- setdiff(B.gcpc@assays$integrated@var.features, c(IG.genes))
B.gcpc <- RunPCA(B.gcpc, npcs = 50, verbose = TRUE)
dims.to.use <- 1:30
B.gcpc <- BuildAnnoyUMAP(B.gcpc,
                         reduction = "pca",
                         metric = "euclidean",
                         dims = dims.to.use,
                         reduction_name = "pca_UMAP",
                         reduction_key = "pcaumap_",
                         return_graphs = TRUE)
DefaultAssay(B.gcpc) <- "RNA"
B.gcpc <- NormalizeData(B.gcpc)

B.gcpc <- FindClusters(B.gcpc, resolution = 0.7, graph.name = "annoy_nn_snn")
B.gcpc$previous_idents <- plyr::mapvalues(rownames(B.gcpc@meta.data), from = GCPC_metadata$identifier, to = as.character(GCPC_metadata$parent_cluster))

save(B.gcpc, file = paste0(parent.directory,"sct_seurat_gcpc_clustered.RData"))

A.4. Finding marker genes for the new clusters.

We use the Seurat FindAllMarkers function and SoupX to find two sets of marker genes for our new clusters.

GCPC_RNA_markers <- FindAllMarkers(B.gcpc, assay = "RNA", logfc.threshold = 0.3, test.use = "LR", verbose = TRUE, only.pos = FALSE, latent.vars = "donor")
GCPC_SoupX_markers <- SoupX::quickMarkers(B.gcpc[["RNA"]]@data, B.gcpc$seurat_clusters, N = 20)
save(GCPC_RNA_markers, GCPC_SoupX_markers, file = paste0(parent.directory, "sct_seurat_gcpc_markers.RData"))

B. Data visualization

B.1 Cluster visualization full

We can now visualize the new clusters and the previous clusters.

load(paste0(parent.directory,"sct_seurat_gcpc_clustered.RData"))
load(paste0(parent.directory,"sct_seurat_gcpc_markers.RData"))
DimPlot(B.gcpc, reduction = "pca_UMAP", label = TRUE, label.size = 5, label.box = TRUE, pt.size = 0.8, group.by = "previous_idents")
DimPlot(B.gcpc, reduction = "pca_UMAP", label = TRUE, label.size = 5, label.box = TRUE, pt.size = 0.8)

B.2 SoupX markers

We will visualize the SoupX markers here.

cluster_list <- sort(unique(B.gcpc$seurat_clusters))
n_clusters <- length(cluster_list)
mrks <- SoupX::quickMarkers(B.gcpc[["RNA"]]@data, B.gcpc$seurat_clusters, N = 20)
mrks_list <- lapply(cluster_list, function(i){
 mrks %>%
   dplyr::filter(cluster == i) %>%
   dplyr::pull(gene)
})
dotplot_list <- lapply(1:n_clusters, function(i) {
  cluster_name <- cluster_list[i]
  temp.slot <- paste0("Cluster ", cluster_name)
  if(length(mrks_list[[{{i}}]]) == 0){return(NULL)}
  src <- c("#### {{temp.slot}} {.unnumbered}",
           "```{r soupdotplot-{{i}}, fig.width = 12, fig.height = 6}",
           "DotPlot(B.gcpc, features = mrks_list[[{{i}}]], assay = \"RNA\")+RotatedAxis()",
           "```",
           "")
  knit_expand(text = src)
})
out <- knit_child(text = unlist(dotplot_list), options = list(echo = FALSE, cache = FALSE))

Cluster 0

Cluster 1

Cluster 2

Cluster 3

Cluster 4

Cluster 5

Cluster 6

Cluster 7

Cluster 8

Cluster 9

Cluster 10

Cluster 11

Cluster 12

B.3 Seurat markers

DefaultAssay(B.gcpc) <- "RNA"
cluster_list <- levels(GCPC_RNA_markers$cluster)
n_clusters <- length(cluster_list)
gene_list <- lapply(cluster_list, function(i){
  GCPC_RNA_markers %>%
    dplyr::filter(p_val_adj < 0.01) %>%
    dplyr::filter(cluster == i) %>%
    dplyr::top_n(20, wt = avg_log2FC) %>%
    dplyr::pull(gene)
})
dotplot_list <- lapply(1:n_clusters, function(i) {
  cluster_name <- cluster_list[i]
  temp.slot <- paste0("Cluster ", cluster_name)
  if(length(gene_list[[i]]) == 0 ){
    return(NULL)
  }
  src <- c("#### {{temp.slot}} {.unnumbered}",
           "```{r dotplot-{{i}}, fig.width = 12, fig.height = 6}",
           "DotPlot(B.gcpc, features = gene_list[[{{i}}]])+RotatedAxis()+scale_color_viridis_c(direction = -1, option = \"A\")",
           "```",
           "")
  knit_expand(text = src)
})
out <- knit_child(text = unlist(dotplot_list), options = list(echo = FALSE, cache = FALSE))

Cluster 0

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 1

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 2

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 3

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 4

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 5

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 6

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 7

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 8

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 9

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 10

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 11

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Cluster 12

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

B.4 Exploring marker genes for each cluster

We include tables of the marker genes found in each cluster.

cluster_vector <- levels(B.gcpc@active.ident)
n_clusters <- length(cluster_vector)

markers.list <- lapply(1:n_clusters, function(i) {
  GCPC_RNA_markers %>%
    dplyr::filter(cluster == cluster_vector[i] & p_val_adj < 0.01) %>%
    dplyr::arrange(-avg_log2FC) %>%
    dplyr::select(cluster, gene, avg_log2FC, p_val, p_val_adj) %>%
    dplyr::mutate_if(is.numeric, format, digits = 3)
})

names(markers.list) <- paste("Cluster", cluster_vector)
markers.list <- c(markers.list, "ALL" = list(GCPC_RNA_markers %>%  dplyr::arrange(-avg_log2FC) %>% select(cluster, gene, avg_log2FC, p_val, p_val_adj) %>% mutate_if(is.numeric, format, digits = 3)))

src_list <- lapply(1:(n_clusters+1), function(i) {
  if(i == (n_clusters+1)){
    label <- "ALL"
  } else {
    label <- cluster_vector[i]
  }
  src <- c("#### {{label}} {.unnumbered} ",
           "<div style = \"width:65%; height:auto; align:left\">",
           "```{r marker-cluster-{{label}}, fig.width=3}",
           "DT::datatable(markers.list[[{{i}}]], rownames = FALSE)",
           "```",
           "</div>",
           "")
  knit_expand(text = src)
})
out <- knit_child(text = unlist(src_list), options = list(echo = FALSE, cache = FALSE))

0

1

2

3

4

5

6

7

8

9

10

11

12

ALL

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

B.5 Cluster renaming

The cluster identities remain largely the same as before.

#load(file = paste0(parent.directory, "sct_seurat_gcpc_clustered.RData"))
cluster_levels <- paste0("C",c(1,8,4,12,7,6,2,10,9,5,3,11,0))
cluster_names <- c("GC 1",
                   "GC 2",
                   "GC 3",
                   "GC IgA",
                   "LZ",
                   "DZ 1",
                   "DZ 2",
                   "DZ 3", 
                   "DZ 4",
                   "DZ 5",
                   "GC LMO2",
                   "ASC IgM",
                   "ASC IgG")
B.gcpc$new_clusters_nums <- factor(paste0("C", as.character(B.gcpc$annoy_nn_snn_res.0.7)),
                                   levels = cluster_levels)
B.gcpc$cluster_names <- plyr::mapvalues(B.gcpc$new_clusters_nums,
                                        from = cluster_levels,
                                        to = cluster_names)
Idents(B.gcpc) <- "cluster_names"
DimPlot(B.gcpc, label = TRUE, label.box = TRUE, repel = TRUE, cols = c("dodgerblue", "forestgreen", RColorBrewer::brewer.pal(11, "Set3")))+NoLegend()
save(B.gcpc, file = paste0(parent.directory, "sct_seurat_gcpc_named_clusters.RData"))
load(file = paste0(parent.directory, "sct_seurat_gcpc_named_clusters.RData"))
DimPlot(B.gcpc, label = TRUE, label.box = TRUE, repel = TRUE, cols = c("dodgerblue", "forestgreen", RColorBrewer::brewer.pal(11, "Set3")))+NoLegend()